inst/tasks/_trash/country checks and solutions/checking FRA.R

# checking austria coded texts

# setting things up
rm(list = ls())
require(idep)
get_ready()
setwd("Z:/Geschäftsordnungen/codingChanges/FRA/R files")
fnames <- sort(list.files(pattern="coded"))

'%.%'   <- function(a,b) paste0(a,b)
'%..%'  <- function(a,b) paste(a,b)


# missing text ?
for ( i in seq_along(fnames)) {
  load(fnames[i])
  if ( length(unique(RESULTS$oldline)) < max(as.numeric(RESULTS$oldline), na.rm=T) | 
       length(unique(RESULTS$newline)) < max(as.numeric(RESULTS$newline), na.rm=T)){
    message(str_pad(i,3) %..% fnames[i])
    message("lines are missing")
  }
}

# get data
i <- 8
load(fnames[i])


# checking if text in data and that in texts match
texts_names <- 
  str_split(fnames[i], " VS ")  %>% 
  unlist  %>% 
  str_replace_all("coded |\\.Rdata", "") %>% 
  paste0("../TXT/", . ,".txt")

texts_txt <- 
  lapply(texts_names, readLines)

oldtext <- 
  RESULTS  %>% 
  select(oldline, oldtext) %>% 
  filter(!duplicated(oldline), !is.na(oldline)) 

comp1 <- cbind(texts_txt[[1]][oldtext$oldline],oldtext$oldtext)

newtext <- 
  RESULTS  %>% 
  select(newline, newtext) %>% 
  filter(!duplicated(newline), !is.na(newline)) 

comp2 <- cbind(texts_txt[[2]][newtext$newline],newtext$newtext)

# word distances for lines within old data compared to text (should be below 2 or 3)
plot(apply(comp1, 1, text_dist))
# comp1[apply(comp1, 1, text_dist) > 2 ,]

plot(apply(comp2, 1, text_dist))
# comp2[apply(comp2, 1, text_dist) > 2 ,]




# adding missing text // oldlines
RESULTS$Date <- as.character(file.info(fnames[i])$ctime)

index <- seq_along(texts_txt[[1]]) 
index <- index[!(index %in% as.numeric(RESULTS$oldline))]

if ( length(index)>0 ) {
  add <- data.frame(
    oldline = index,
    oldtext = clean_text(texts_txt[[1]][index]),
    similarity = 0,
    relevant=0,
    coder= "Peter",
    Date= as.character(Sys.time())
  )
} else {
  add <- data.frame()
}

RESULTS <- rbind.fill(RESULTS, add)


index <- seq_along(texts_txt[[2]]) 
index <- index[!(index %in% as.numeric(RESULTS$newline))]

if ( length(index) > 0 ) {
  add <- data.frame(
    newline = index,
    newtext = clean_text(texts_txt[[2]][index]),
    similarity = 0,
    relevant=0,
    coder= "Peter",
    Date=as.character(Sys.time())
  )  
} else{
  add <- data.frame()  
}

RESULTS <- rbind.fill(RESULTS, add)

# adding Aorigtext / Borigtext and otehr vars 

if ( is.null(RESULTS$Aorigtext) ) {
  RESULTS$Aorigtext <- 
    str_replace(
      texts_txt[[1]][as.numeric(RESULTS$oldline)]
      , "^ ?#§# ?", "")
}

if ( is.null(RESULTS$Borigtext) ) {
  RESULTS$Borigtext <- 
    str_replace(
      texts_txt[[2]][as.numeric(RESULTS$newline)]
      , "^ ?#§# ?", "")
}

if ( is.null(RESULTS$duration) ) {
  RESULTS$duration <- NA
}

RESULTS$Afile <- str_replace(texts_names,
                             "\\.\\.",
                             "Z:/Geschäftsordnungen/codingChanges/FRA")[1]
RESULTS$Bfile <- str_replace(texts_names,
                             "\\.\\.",
                             "Z:/Geschäftsordnungen/codingChanges/FRA")[2]

if( is.null(RESULTS$comments) ) RESULTS$comments <- ""

# save(RESULTS, file=fnames[i])
petermeissner/idep documentation built on May 25, 2019, 1:53 a.m.